## DIME Merging, take 3
library(tidyverse);library(foreign)
library(reshape2); library(data.table)
# All relative file names used below are resolved against this directory.
setwd("~/shared_space/pcds/other/counties")
# setwd("~/shared_space/nd_jdbk/Bonica data")

## pre-processing contributor data into data.table format:
# b <- fread("~/shared_space/nd_jdbk/Bonica data/contributor data/contribDB_2006.csv")
# if(class(b)[1]!="data.table"){b <- data.table(b)}
# save(b,file="~/shared_space/nd_jdbk/Bonica data/contributor data/contribDB_2006.Rdata")

# zipcode crosswalk file:
# The first data row is dropped (presumably a second header/label row in the
# crosswalk file -- TODO confirm against the raw CSV).
zips <- read_csv("zips_to_counties.csv")[-1, ]
# Drop rows whose zcta5 is "99999". A logical mask is used rather than
# `-which(...)`: if no row matched, `-which()` would be `-integer(0)`, which
# selects ZERO rows and silently empties the table. `%in%` never returns NA,
# so rows with a missing zcta5 are kept, exactly as `-which()` kept them.
zips <- zips[!(zips$zcta5 %in% "99999"), ]

## elections data:
# Read the council elections file and reshape it to long format: one row per
# election record and candidate column (columns whose names end in "cand"
# followed by one or two digits). The column name ends up in `variable`, the
# candidate name in `value`.
# NOTE: the `cands` table built here is replaced further down, where `cands`
# is re-created from counties_multimember.csv.
elecs <- read.dta("council_all.dta")
election_ids <- c("fips", "county", "state", "district", "year")
cands <- melt(
  select(elecs, fips, county, state, district, year,
         grep("cand\\d{1,2}$", names(elecs))),
  id.vars = election_ids
)
# Keep only filled candidate slots: drop true missing values as well as the
# literal string "NA".
cands <- filter(cands, !is.na(value), value != "NA")
# fixing year errors:
cands$year[cands$year %in% c("20006", "2996")] <- "2006"

# for multimember candidates:
# NOTE: this assignment REPLACES the `cands` table built above from
# council_all.dta; everything downstream runs on the multimember file only.
# The first column of the CSV is dropped (presumably a row-name column written
# by an earlier write.csv -- TODO confirm).
elecs2 <- read.csv("counties_multimember.csv", stringsAsFactors = FALSE)[, -1]
# Long format: one row per election record and candidate column; `elecyear`
# is renamed to `year` so the id columns match the names used downstream.
cands <- elecs2 %>%
  select(fips, county, state, district, year = elecyear,
         grep("cand\\d{1,2}$", names(elecs2))) %>%
  melt(id.vars = c("fips", "county", "state", "district", "year"))
# Treat the literal string "NA" as missing, then drop empty candidate slots.
# NOTE(review): blank strings ("") are NOT dropped here; if the CSV encodes
# empty candidate slots as "", they survive as candidates with empty names --
# verify against the data.
cands$value[cands$value == "NA"] <- NA
cands <- cands %>%
  filter(!is.na(value))


# Fixing error-prone multi-byte character names:
# iconv() is called with its default `from`/`to` arguments, so no explicit
# source or target encoding is specified here.
# NOTE(review): per the iconv() documentation, elements that cannot be
# converted are returned as NA unless `sub` is supplied; such candidates would
# get NA-derived name keys below -- confirm no names are lost at this step.
cands$value <- iconv(cands$value)
# Manual, row-index-based corrections kept for reference only. They are
# disabled; the hard-coded indices presumably refer to an earlier version of
# the candidate table -- verify before re-enabling.
# cands$value[cands$value=="Joan M. Voss\xbe"] <- "Joan M. Voss" 
# cands$value[9479] <- "Louis Cappelli Jr.(I)"
# cands$value[9519] <- "Joseph Derella Jr."
# cands$value[11403] <- "Carrie Solages"
# cands$value[14874] <- "Bernice Scot"
# cands$value[14876] <- "Cecil Tillis"
# cands$value[16091] <- "Mike Stewart"
# cands$value[25566] <- "Jerry Carrington"
# cands$value[26235] <- "Douglas Long"
# cands$value[26256] <- "Patricia SEBOLD"
# cands$value[26257] <- "Jeannette VERA"
# cands$value[26341] <- "Paul H. CASTELLI"
# cands$value[30884] <- "Corey D. OBrien"
# cands$value[31590] <- "Mary Hopkins Anderson"
# cands$value[32808] <- "Dale F. Gardiner"

# Build the normalised name keys used to match candidates to contributors:
#   name      - lower-case full name, no incumbency tag, no periods
#   name_noMI - first word + last word of `name`
#   lastname  - last run of word characters in `name`
#   finitial  - first non-space character of `name`
cands$name <- gsub("\\(I\\)","",x=cands$value) # get rid of incumbency tag
cands$name <- gsub("\\.","",x=tolower(cands$name)) # format the same as contributions data, no periods after MI
# Removing "(I)" from e.g. "Name (I)" leaves a trailing space. The patterns
# below are anchored with `(\\w+)$`, so any trailing whitespace makes them fail
# to match and the whole string is returned as `name_noMI` / `lastname`.
# Trim leading/trailing whitespace first so the anchors see the actual name.
cands$name <- trimws(cands$name)
cands$name_noMI <- gsub("^(\\w+) .*?(\\w+)$","\\1 \\2",x=cands$name) # only firstname/lastname, no MI
cands$lastname <- gsub(".*?(\\w+)$","\\1",x=cands$name) # only lastname, no MI or FI
cands$finitial <- gsub("^(\\S{1}).*","\\1",cands$name) # FI
# NOTE(review): because `lastname` is the final run of word characters, a name
# ending in a suffix (e.g. "... jr") yields the suffix as `lastname`, and
# hyphenated or apostrophised surnames are cut at the last non-word character.
# Left unchanged here; verify whether this matters for the match rate.

# Convert place name to lowercase county only for matching later:
cands$county <- gsub( "\\.", "",cands$county)
cands$county <- gsub( " County", "",cands$county)
cands$county <- tolower(cands$county)

# convert to DT for optimization:
cands <- as.data.table(cands)
#####################################################
##### Matching elections + Bonica contributors: #####
#####################################################


# Match every candidate in `cands` against every saved contributor file.
#
# For each .Rdata file (each is expected to define a table named `b`):
#   1. keep/rename the contributor columns needed for matching;
#   2. pre-filter contributors to those sharing a full name, a first+last
#      name, or a last name with any candidate;
#   3. for each candidate, keep contributors whose zipcode lies in the
#      candidate's county (via the `zips` crosswalk) AND whose name matches
#      (full name, name without middle initial, or first initial + last name).
# Matches are collected in `allmatches` (contributor columns followed by the
# candidate's identifying columns); a partial copy is saved every 1000
# candidates.
allmatches <- NULL
match_chunks <- list() # one element per (file, candidate) with >= 1 match

newfiles <- list.files("~/shared_space/nd_jdbk/Bonica data/contributor data",
                       recursive = TRUE, pattern = "Rdata$", full.names = TRUE)

# Loop invariants, computed once instead of once per candidate/file:
zip_fips <- as.numeric(zips$county)            # county code of each crosswalk row
cand_names <- c(cands$name, cands$name_noMI)   # every candidate name variant
cand_cols <- c("fips","county","state","district","year","variable","value",
               "name","name_noMI","lastname")
# everything but address (and finitial), as in the de-duplication below
dedup_cols <- c("cycle","bonica_cid","FirstNameMILastName","fname","lname",
                "city","zipcode","state","occupation","cfscore","bonica_rid",
                "cand","firstnamelastname")

for (j in seq_along(newfiles)) { # seq_along: no iteration when no file is found
  load(newfiles[j]) # expected to create `b`
  # progress update (total taken from the file list, not hard-coded):
  print(paste0("Bonica data: starting ",j," of ",length(newfiles)))

  # setting up contributor data
  a <- if (is.data.table(b)) b else data.table(b)
  rm(b)
  a$FirstNameMILastName <- paste(a$contributor_fname, a$contributor_mname,a$contributor_lname,sep=" ") # all lowercase, concatenated for matching
  var_list <- c("cycle",
                "bonica_cid",
                "FirstNameMILastName", # lowercase
                "contributor_fname",
                "contributor_lname",
                "contributor_address",
                "contributor_city",
                "contributor_zipcode",
                "contributor_state", # two-letter cap abbrev
                "contributor_occupation",
                "contributor_cfscore")
  # select by name explicitly; `with = FALSE` makes `var_list` a vector of
  # column names rather than an expression evaluated inside the table
  a <- a[, var_list, with = FALSE]

  a$bonica_rid <- NA
  a$cand <- 0
  setnames(a, c("cycle",
                "bonica_cid",
                "FirstNameMILastName",
                "fname",
                "lname",
                "address",
                "city",
                "zipcode",
                "state", # two-letter cap abbrev
                "occupation",
                "cfscore",
                "bonica_rid",
                "cand"))
  bonica <- a
  rm(a)

  # create alternate name w/o MI for matching:
  bonica$firstnamelastname <- paste(bonica$fname,bonica$lname,sep=" ") # without MI
  bonica$finitial <- gsub("^(\\S{1}).*","\\1",bonica$fname) # FI

  # subset to those in elections data by checking either name w/ MI, version w/o MI, or just lastname:
  bonica <- bonica %>%
    filter(FirstNameMILastName %in% cand_names |
             firstnamelastname %in% cand_names |
             lname %in% cands$lastname)
  # keep unique values for everything but address, discard extra addresses if
  # everything else the same (first occurrence is kept)
  bonica2 <- unique(as.data.table(bonica), by = dedup_cols)

  thismatch <- NULL
  winnermatches <- NULL
  for (i in seq_len(nrow(cands))) {
    # progress update:
    cat("\r",paste0("Candidates: matching ",i," of ",nrow(cands)," candidates"))

    # get matching zipcodes for county (which() drops NA comparisons):
    matchzips <- zips[which(zip_fips == cands$fips[i]), "zcta5"]

    # match on zip codes from given county/State AND name, either with/without MI
    # NOTE(review): `%in%` compares after coercion to a common type; if
    # `zipcode` was read as a number while `zcta5` is text with leading zeros,
    # those zip codes can never match -- confirm the column types.
    winnermatches <- bonica2 %>%
      filter(zipcode %in% matchzips$zcta5 & # geo matching
               #                state == as.character(cands$state[i]) &
               (FirstNameMILastName == cands$name[i] |  # name matching:
                  FirstNameMILastName==cands$name_noMI[i] |
                  firstnamelastname==cands$name[i] |
                  firstnamelastname==cands$name_noMI[i] |
                  (finitial == cands$finitial[i] & lname == cands$lastname[i])
               ))
    if (nrow(winnermatches) >= 1) {
      # the single candidate row is recycled across all matching contributors
      thismatch <- cbind(winnermatches, cands[i, cand_cols, with = FALSE])
      # collect in a list; binding once avoids re-copying the growing table
      match_chunks[[length(match_chunks) + 1L]] <- thismatch
    }
    if (i %% 1000 == 0) { # save results every 1000
      if (length(match_chunks) > 0) {
        # all chunks share one column layout, so bind by position
        allmatches <- rbindlist(match_chunks, use.names = FALSE)
      }
      save(allmatches,file="allmatches_mm_partial.Rdata")
    }
  }
}

# final bind of everything collected (stays NULL when nothing matched)
if (length(match_chunks) > 0) {
  allmatches <- rbindlist(match_chunks, use.names = FALSE)
}

# Sanity check: how the matches spread over contribution cycles and over
# election years.
table(allmatches[["cycle"]])
table(allmatches[["year"]])
# save(allmatches,file="allmatches_second.Rdata")
# allmatches2 <- allmatches
# load(file="allmatches_first.Rdata")
# allmatches <- rbind(allmatches, allmatches2)
# save(allmatches,file="allmatches_full.Rdata")
# load(file="allmatches_full.Rdata")
# Persist the full match table, then read it back from the saved file.
save(allmatches, file = "allmatches_mm_full.Rdata")
load(file = "allmatches_mm_full.Rdata")

# Column 9 is renamed to "state_abb" so it is distinguishable from the other
# "state" column of the table.
names(allmatches)[9] <- "state_abb"
allmatches[["cfscore"]] <- as.numeric(allmatches[["cfscore"]])
# Drop rows that repeat an earlier row on every listed column (address is the
# only column left out of the comparison); the first occurrence is kept.
# allmatches <- allmatches[!duplicated(allmatches[,c(1:12,14,15,17)]),]
is_repeat <- duplicated(allmatches[,c("cycle","bonica_cid","FirstNameMILastName","fname","lname","city","zipcode","state_abb","occupation","cfscore","bonica_rid","cand","firstnamelastname","finitial","fips","county","state","district","year","variable","value","name","name_noMI","lastname")])
allmatches <- allmatches[!is_repeat, ]
length(unique(allmatches[["FirstNameMILastName"]])) # 30931 unique candidates

## now match elecs names to CF-scores in the allmatches file
# want to keep in elecs for candidate: cfscore, rid, cid
#
# For each candidate row: take the rows of `allmatches` in the same county
# (fips) whose contributor name matches the candidate, restrict them to
# contribution cycles within 6 years of the elections this fips/lastname
# appears in, and then
#   - exactly one distinct cfscore -> write cfscore / contributor id / cycle
#     of the first such row into `cands`;
#   - more than one distinct cfscore -> append the rows to `manualcheck`
#     for hand resolution.

cands$cfscore <- as.numeric(rep(NA,nrow(cands)))
cands$dime_cid <- as.numeric(rep(NA,nrow(cands)))
cands$dime_yearmatch <- as.numeric(rep(NA,nrow(cands)))

yearmax <- NULL
yearmin <- NULL

manualcheck <- NULL
namematches <- NULL

check_chunks <- list()                  # ambiguous matches, bound once after the loop
cand_year <- as.numeric(cands$year)     # loop invariant: year is not modified below
check_cols <- c("cycle","bonica_cid","lname","fname","address","city","zipcode","state_abb",
                "occupation","cfscore","FirstNameMILastName","bonica_rid","cand","firstnamelastname",
                "fips","county","state","district","year","variable")

for (i in seq_len(nrow(cands))) { # seq_len: no iteration on an empty table
  # progress update:
  cat("\r",paste0("Candidates: matching ",i," of ",nrow(cands)," candidates"))

  ## Candidate i: contributor rows in the same county with a matching name
  winnermatches <- allmatches %>%
    filter(fips == cands$fips[i] &
             (FirstNameMILastName == cands$name[i] |  # name matching:
                FirstNameMILastName==cands$name_noMI[i] |
                firstnamelastname==cands$name[i] |
                firstnamelastname==cands$name_noMI[i] |
                (finitial == cands$finitial[i] & lname == cands$lastname[i])
             ))
  # all election rows in this county with the same last name define the
  # admissible window of contribution cycles
  same_name_rows <- which(cands$fips == cands$fips[i] & cands$lastname == cands$lastname[i])
  yearmax <- max(cand_year[same_name_rows]) + 6 # within three election cycles of the county election campaign where this candidate ran
  yearmin <- min(cand_year[same_name_rows]) - 6
  winnermatches <- winnermatches[which(winnermatches$cycle <= yearmax & winnermatches$cycle >= yearmin),]

  # if no matches, do nothing
  # if there IS a match/matches:
  if (nrow(winnermatches) >= 1) {
    winnermatches <- winnermatches[!duplicated(winnermatches[,c("cycle","bonica_cid","occupation","cfscore")]),]
    n_scores <- length(unique(winnermatches$cfscore)) # >= 1 because nrow >= 1
    if (n_scores == 1) {
      # only one unique cfscore: set election info equal to first matching row
      cands[i,c("cfscore","dime_cid","dime_yearmatch")] <- winnermatches[1,c("cfscore","bonica_cid","cycle")]
    } else {
      # more than one possible cfscore: queue every candidate row for manual review
      thisduplicate <- cbind(winnermatches[,c("cycle","bonica_cid","lname","fname","address","city","zipcode","state_abb",
                                              "occupation","cfscore","FirstNameMILastName","bonica_rid","cand","firstnamelastname",
                                              "fips","county","state","district","year","variable")],cands[i,"name"])
      colnames(thisduplicate) <- c(check_cols, "elec_name")
      # collect in a list; binding once avoids re-copying the growing table
      check_chunks[[length(check_chunks) + 1L]] <- thisduplicate
    }
  }
  if (i %% 1000 == 0) { # save results every 1000
    write.csv(cands,file="county_elections_cfscores_mm_partial.csv",row.names=FALSE)
  }

}

# bind the queued manual-review rows (manualcheck stays NULL when there are none)
if (length(check_chunks) > 0) {
  manualcheck <- rbindlist(check_chunks, use.names = FALSE)
}



# Summary counts (the numbers in the trailing comments are from a past run).
sum(!is.na(cands$cfscore)) # 7459 matched fully
length(unique(cands$name[which(!is.na(cands$cfscore) & cands$variable=="cand1")])) # 4475 unique candidates that matched fully, 2316 winners
# writeLines(text = as.character(100*round(length(unique(cands$name[which(!is.na(cands$cfscore) & cands$variable=="cand1")]))/length(unique(cands$name[which(cands$variable=="cand1")])),2)),con = "perc_winners_cfmatched.tex")
# writeLines(text = as.character(length(unique(cands$name[which(!is.na(cands$cfscore) & cands$variable=="cand1")]))),con = "n_winners_cfmatched.tex")
# writeLines(text = as.character(length(unique(cands$name[which(cands$variable=="cand1")]))),con = "n_winners.tex")
# writeLines(text = as.character(round(max(cands$cfscore,na.rm=T),1)),con="max_cfscore.tex")
# writeLines(text = as.character(round(min(cands$cfscore,na.rm=T),1)),con="min_cfscore.tex")

head(manualcheck)
length(unique(manualcheck$elec_name)) # 4239 unique candidates that need checking

# Cut down to actually unique contributors to pick from:
# keep the first row for each (contributor id, occupation, cfscore, election
# name) combination. Because `elec_name` is part of the key, no candidate is
# ever removed from the review list by this step.
# NOTE(review): an additional pass keyed on (bonica_cid, city, state_abb,
# occupation, cfscore, FirstNameMILastName) used to be assigned to
# `manualcheck2` just before this line, but this statement starts again from
# `manualcheck` and overwrote it, so that pass never affected the output. The
# dead assignment is removed; the result is unchanged.
manualcheck2 <- manualcheck[!duplicated(manualcheck[,c("bonica_cid","occupation","cfscore","elec_name")]),]
length(unique(manualcheck2$elec_name)) # 4239
length(unique(manualcheck2$elec_name[manualcheck2$variable=="cand1"])) # 2525
dim(manualcheck2) # 31612 rows, but still only 2525 people to actually check

# Final outputs (written with write.csv defaults, i.e. including row names).
write.csv(manualcheck2,"manual_check_cfscores_mm.csv")
write.csv(cands,"county_elections_cfscores_mm.csv")

  